-
Notifications
You must be signed in to change notification settings - Fork 238
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Bugfix for batched gemv #2481
Bugfix for batched gemv #2481
Conversation
Shouldn't CUDA.jl/lib/cublas/wrappers.jl Lines 426 to 438 in 8b54f85
Can you add a test that covers the case that doesn't work right now, and works after the change? |
No, according to the official cuBLAS documentation, the definitions of `m` and `n` refer to the dimensions of the matrix A before any transpose operation is applied — which is what this fix accounts for. |
I will try to add some tests this week. |
See also the CUDA.jl/lib/cublas/wrappers.jl Lines 378 to 384 in bbe625b
|
@maleadt A similar bug was found on |
LGTM, let's just ping the original author of these functions: @lpawela |
@maleadt What is the status of this PR? |
It hangs on me, sorry. I'll have a look within a couple of days. |
I have problems launching tests on this patch.
when launching julia> CUDA.versioninfo()
CUDA runtime 12.6, artifact installation
CUDA driver 12.4
NVIDIA driver 550.90.7
CUDA libraries:
- CUBLAS: 12.6.0
- CURAND: 10.3.7
- CUFFT: 11.2.6
- CUSOLVER: 11.6.4
- CUSPARSE: 12.5.2
- CUPTI: 2024.3.0 (API 24.0.0)
- NVML: 12.0.0+550.90.7
Julia packages:
- CUDA: 5.5.0
- CUDA_Driver_jll: 0.10.0+0
- CUDA_Runtime_jll: 0.15.1+0
Toolchain:
- Julia: 1.10.2
- LLVM: 15.0.7
1 device:
0: NVIDIA GeForce RTX 3080 (sm_86, 7.857 GiB / 10.000 GiB available) |
The changes in this PR seem to be triggering an illegal memory access. |
I'm seeing similar issues locally, but I'm having a hard time isolating the problem. Many times, the |
Actually, some more testing today reveals that the illegal memory access I was seeing locally comes from a different test. @lpawela I cannot reproduce the isolated failure with the following test:
using CUDA.CUBLAS, GPUArrays
using CUDA, Test, LinearAlgebra
using Adapt
# Marker type used with Adapt.jl to convert CPU arrays to the array type `AT`
# (e.g. `CuArray`) when preparing GPU-side test inputs in `compare`.
struct ArrayAdaptor{AT} end
# Adapt hook: converting storage through an `ArrayAdaptor{AT}` constructs `AT(xs)`.
Adapt.adapt_storage(::ArrayAdaptor{AT}, xs::AbstractArray) where {AT} = AT(xs)
# Compare CPU and GPU results for approximate equality.
# `missing` only matches `missing`; numbers and arrays use `isapprox`,
# and tuples are compared element-wise.
test_result(a::Number, b::Number; kwargs...) = isapprox(a, b; kwargs...)
test_result(a::Missing, b::Missing; kwargs...) = true
test_result(a::Number, b::Missing; kwargs...) = false
test_result(a::Missing, b::Number; kwargs...) = false
function test_result(a::AbstractArray{T}, b::AbstractArray{T}; kwargs...) where {T<:Number}
    # materialize both sides on the CPU before comparing
    return isapprox(collect(a), collect(b); kwargs...)
end
function test_result(a::AbstractArray{T}, b::AbstractArray{T};
                     kwargs...) where {T<:NTuple{N,<:Number} where {N}}
    # arrays of homogeneous tuples: compare as flat numeric arrays
    scalar = eltype(T)
    return isapprox(reinterpret(scalar, collect(a)), reinterpret(scalar, collect(b));
                    kwargs...)
end
function test_result(as::NTuple{N,Any}, bs::NTuple{N,Any}; kwargs...) where {N}
    # every paired element must match
    return all(test_result(a, b; kwargs...) for (a, b) in zip(as, bs))
end
function compare(f, AT::Type{<:AbstractGPUArray}, xs...; kwargs...)
    # Run `f` on plain CPU copies and on GPU-adapted copies of the same inputs,
    # then compare the two results. `Ref` arguments are unwrapped, not adapted.
    to_cpu(x) = isa(x, Base.RefValue) ? x[] : deepcopy(x)
    to_gpu(x) = isa(x, Base.RefValue) ? x[] : adapt(ArrayAdaptor{AT}(), x)
    host_args = map(to_cpu, xs)
    device_args = map(to_gpu, xs)
    host_out = f(host_args...)
    device_out = f(device_args...)
    return test_result(host_out, device_out; kwargs...)
end
function compare(f, AT::Type{<:Array}, xs...; kwargs...)
    # No need to actually run this test: there is nothing to compare against,
    # and `f` runs on a CPU array anyhow when comparing against a GPU array.
    #
    # This method exists so the test suite can also be driven with `Array`,
    # making sure we still cover the tests that don't call `compare`.
    return true
end
# Convenience wrapper: compare CPU and CUDA (`CuArray`) results of `f` on `xs`.
testf(f, xs...; kwargs...) = compare(f, CuArray, xs...; kwargs...)
# Matrix/vector dimensions shared by the tests below.
m = 20
n = 35
k = 13
@testset for elty in [Float32, Float64, ComplexF32, ComplexF64]
    alpha = rand(elty)
    beta = rand(elty)

    @testset "gemv" begin
        # matrix-vector products through `*`, for all three transpose modes
        @test testf(*, rand(elty, m, n), rand(elty, n))
        @test testf(*, transpose(rand(elty, m, n)), rand(elty, m))
        @test testf(*, rand(elty, m, n)', rand(elty, m))

        # output length must match the row count of op(A)
        x = rand(elty, m)
        A = rand(elty, m, m + 1)
        y = rand(elty, n)
        dx = CuArray(x)
        dA = CuArray(A)
        dy = CuArray(y)
        @test_throws DimensionMismatch mul!(dy, dA, dx)
        A = rand(elty, m + 1, m)
        dA = CuArray(A)
        @test_throws DimensionMismatch mul!(dy, dA, dx)

        # out-of-place gemv, with and without a scaling factor
        x = rand(elty, m)
        A = rand(elty, n, m)
        dx = CuArray(x)
        dA = CuArray(A)
        alpha = rand(elty)
        dy = CUBLAS.gemv('N', alpha, dA, dx)
        hy = collect(dy)
        @test hy ≈ alpha * A * x
        dy = CUBLAS.gemv('N', dA, dx)
        hy = collect(dy)
        @test hy ≈ A * x

        # transposed ('T') / adjoint ('C') gemv
        dy = CuArray(y)
        dx = CUBLAS.gemv(elty <: Real ? 'T' : 'C', alpha, dA, dy)
        hx = collect(dx)
        @test hx ≈ alpha * A' * y
    end

    if CUBLAS.version() >= v"11.9"
        @testset "gemv_batched" begin
            # batches are Vectors of device arrays, one entry per problem
            x = [rand(elty, m) for i in 1:10]
            A = [rand(elty, n, m) for i in 1:10]
            y = [rand(elty, n) for i in 1:10]
            dx = CuArray{elty, 1}[]
            dA = CuArray{elty, 2}[]
            dy = CuArray{elty, 1}[]
            dbad = CuArray{elty, 1}[]
            for i in 1:length(A)
                push!(dA, CuArray(A[i]))
                push!(dx, CuArray(x[i]))
                push!(dy, CuArray(y[i]))
                if i < length(A) - 2
                    # deliberately shorter batch to trigger a DimensionMismatch
                    push!(dbad, CuArray(dx[i]))
                end
            end
            @test_throws DimensionMismatch CUBLAS.gemv_batched!('N', alpha, dA, dx, beta, dbad)
            CUBLAS.gemv_batched!('N', alpha, dA, dx, beta, dy)
            for i in 1:length(A)
                hy = collect(dy[i])
                y[i] = alpha * A[i] * x[i] + beta * y[i]
                @test y[i] ≈ hy
            end

            # transposed/adjoint batched gemv
            dy = CuArray{elty, 1}[]
            for i in 1:length(A)
                push!(dy, CuArray(y[i]))
            end
            CUBLAS.gemv_batched!(elty <: Real ? 'T' : 'C', alpha, dA, dy, beta, dx)
            # BUGFIX: `A` is a Vector of matrices here, so iterate 1:length(A).
            # The previous `1:size(A, 3)` evaluated to 1:1 (trailing dims of a
            # Vector are 1), silently skipping verification of all but the
            # first batch entry.
            for i in 1:length(A)
                hx = collect(dx[i])
                x[i] = alpha * A[i]' * y[i] + beta * x[i]
                @test x[i] ≈ hx
            end
        end
    end

    if CUBLAS.version() >= v"11.9"
        @testset "gemv_strided_batched" begin
            # batches stored as an extra array dimension with uniform strides
            x = rand(elty, m, 10)
            A = rand(elty, n, m, 10)
            y = rand(elty, n, 10)
            bad = rand(elty, m, 10)
            dx = CuArray(x)
            dA = CuArray(A)
            dy = CuArray(y)
            dbad = CuArray(bad)
            # wrong output vector length
            @test_throws DimensionMismatch CUBLAS.gemv_strided_batched!('N', alpha, dA, dx, beta, dbad)
            # wrong batch count
            bad = rand(elty, n, 2)
            dbad = CuArray(bad)
            @test_throws DimensionMismatch CUBLAS.gemv_strided_batched!('N', alpha, dA, dx, beta, dbad)
            CUBLAS.gemv_strided_batched!('N', alpha, dA, dx, beta, dy)
            for i in 1:size(A, 3)
                hy = collect(dy[:, i])
                y[:, i] = alpha * A[:, :, i] * x[:, i] + beta * y[:, i]
                @test y[:, i] ≈ hy
            end

            # transposed/adjoint strided-batched gemv
            dy = CuArray(y)
            CUBLAS.gemv_strided_batched!(elty <: Real ? 'T' : 'C', alpha, dA, dy, beta, dx)
            for i in 1:size(A, 3)
                hx = collect(dx[:, i])
                x[:, i] = alpha * A[:, :, i]' * y[:, i] + beta * x[:, i]
                @test x[:, i] ≈ hx
            end
        end
    end
end
I still get the same error, even on a different machine. The command
|
Okay, thanks for confirming! Marked as draft until we figure out the exact issue here. EDIT: does the isolated reproducer also give the same error? |
7ec9719
to
c5571cf
Compare
5d585c4
to
c850163
Compare
@kose-y Is it possible to rebase this PR? |
test/libraries/cublas/level2.jl
Outdated
push!(dy, CuArray(y[i])) | ||
end | ||
CUBLAS.gemv_batched!(elty <: Real ? 'T' : 'C', alpha, dA, dy, beta, dx) | ||
for i=1:length(A) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Previous bug: this line was `for i = 1:size(A, 3)`, where `A` is a `Vector` of `CuMatrix`.
Not sure if that is related to illegal memory access.
I have attempted to rebase and, in the process, fixed a bug in the test code. |
Codecov ReportAttention: Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## master #2481 +/- ##
==========================================
+ Coverage 73.25% 73.37% +0.12%
==========================================
Files 157 157
Lines 15272 15268 -4
==========================================
+ Hits 11187 11203 +16
+ Misses 4085 4065 -20 ☔ View full report in Codecov by Sentry. |
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.diff --git a/test/libraries/cublas/level2.jl b/test/libraries/cublas/level2.jl
index ce65052c4..f93e79b04 100644
--- a/test/libraries/cublas/level2.jl
+++ b/test/libraries/cublas/level2.jl
@@ -77,7 +77,7 @@ k = 13
@test y[i] ≈ hy
end
dy = CuArray{elty, 1}[]
- for i=1:length(A)
+ for i in 1:length(A)
push!(dy, CuArray(y[i]))
end
CUBLAS.gemv_batched!(elty <: Real ? 'T' : 'C', alpha, dA, dy, beta, dx) |
Thanks @kose-y! |
OK, here you go. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
CUDA.jl Benchmarks
Benchmark suite | Current: 2e316ac | Previous: 24c236a | Ratio |
---|---|---|---|
latency/precompile |
46304515857 ns |
46329110276 ns |
1.00 |
latency/ttfp |
6988044874 ns |
6970088608 ns |
1.00 |
latency/import |
3648625168 ns |
3628117061 ns |
1.01 |
integration/volumerhs |
9622426 ns |
9622596.5 ns |
1.00 |
integration/byval/slices=1 |
146790 ns |
147068 ns |
1.00 |
integration/byval/slices=3 |
425208 ns |
425521 ns |
1.00 |
integration/byval/reference |
144849.5 ns |
144904 ns |
1.00 |
integration/byval/slices=2 |
286097 ns |
286164 ns |
1.00 |
integration/cudadevrt |
103234 ns |
103422 ns |
1.00 |
kernel/indexing |
14150 ns |
14042 ns |
1.01 |
kernel/indexing_checked |
14761 ns |
14520 ns |
1.02 |
kernel/occupancy |
643.0441176470588 ns |
633.7076023391813 ns |
1.01 |
kernel/launch |
2048.1499999999996 ns |
2101.9 ns |
0.97 |
kernel/rand |
17039 ns |
14379 ns |
1.18 |
array/reverse/1d |
19865 ns |
19719 ns |
1.01 |
array/reverse/2d |
24149 ns |
25121 ns |
0.96 |
array/reverse/1d_inplace |
10495 ns |
11160 ns |
0.94 |
array/reverse/2d_inplace |
11919 ns |
13070 ns |
0.91 |
array/copy |
21546 ns |
21024 ns |
1.02 |
array/iteration/findall/int |
151894.5 ns |
155786 ns |
0.98 |
array/iteration/findall/bool |
137497 ns |
134662.5 ns |
1.02 |
array/iteration/findfirst/int |
153659 ns |
147148 ns |
1.04 |
array/iteration/findfirst/bool |
153940 ns |
154167.5 ns |
1.00 |
array/iteration/scalar |
60775 ns |
61499 ns |
0.99 |
array/iteration/logical |
209734.5 ns |
203811.5 ns |
1.03 |
array/iteration/findmin/1d |
39163.5 ns |
39639 ns |
0.99 |
array/iteration/findmin/2d |
94362 ns |
94387 ns |
1.00 |
array/reductions/reduce/1d |
41562.5 ns |
30507 ns |
1.36 |
array/reductions/reduce/2d |
47023.5 ns |
51213.5 ns |
0.92 |
array/reductions/mapreduce/1d |
39516.5 ns |
30459 ns |
1.30 |
array/reductions/mapreduce/2d |
51401.5 ns |
51487 ns |
1.00 |
array/broadcast |
20832 ns |
20729 ns |
1.00 |
array/copyto!/gpu_to_gpu |
13807 ns |
11560 ns |
1.19 |
array/copyto!/cpu_to_gpu |
210029 ns |
208930 ns |
1.01 |
array/copyto!/gpu_to_cpu |
244002 ns |
241934 ns |
1.01 |
array/accumulate/1d |
109245.5 ns |
108332 ns |
1.01 |
array/accumulate/2d |
81223 ns |
80060 ns |
1.01 |
array/construct |
1291.7 ns |
1258.3 ns |
1.03 |
array/random/randn/Float32 |
47413 ns |
42993 ns |
1.10 |
array/random/randn!/Float32 |
27000 ns |
26226 ns |
1.03 |
array/random/rand!/Int64 |
26953 ns |
26878 ns |
1.00 |
array/random/rand!/Float32 |
8504 ns |
8569 ns |
0.99 |
array/random/rand/Int64 |
38226 ns |
29957 ns |
1.28 |
array/random/rand/Float32 |
13232 ns |
12962 ns |
1.02 |
array/permutedims/4d |
61212 ns |
61080 ns |
1.00 |
array/permutedims/2d |
55505.5 ns |
55180 ns |
1.01 |
array/permutedims/3d |
56559 ns |
55780 ns |
1.01 |
array/sorting/1d |
2776294 ns |
2774685 ns |
1.00 |
array/sorting/by |
3367591 ns |
3367411.5 ns |
1.00 |
array/sorting/2d |
1085256 ns |
1085055 ns |
1.00 |
cuda/synchronization/stream/auto |
1022.0666666666667 ns |
1048.8 ns |
0.97 |
cuda/synchronization/stream/nonblocking |
6388.4 ns |
6455.4 ns |
0.99 |
cuda/synchronization/stream/blocking |
807.9901960784314 ns |
846.986301369863 ns |
0.95 |
cuda/synchronization/context/auto |
1193.5 ns |
1230.1 ns |
0.97 |
cuda/synchronization/context/nonblocking |
6679.4 ns |
6670.8 ns |
1.00 |
cuda/synchronization/context/blocking |
916.921052631579 ns |
928 ns |
0.99 |
This comment was automatically generated by workflow using github-action-benchmark.
Fix incorrect definition of m and n in gemv_strided_batched!